Add propagate to predict prophage activity#110
Add propagate to predict prophage activity#110CarsonJM wants to merge 14 commits intonf-core:devfrom
Conversation
|
adamrtalbot
left a comment
There was a problem hiding this comment.
Nothing deal breaking:
- avoid loops for creating pandas DataFrames, it causes exponential memory increase for bigger samples
- Got a few Groovy things within map statements that I think can be done better. They might cause problems with crossed memory stuff.
- The Nextflow itself is good.
I think these are improvements but none are blocking.
There was a problem hiding this comment.
This feels like a big overhead for what it's doing. Is it just checking the FASTA header is in the TSV? We should be able to streamline this...
| # identify checkv provirus coordinates | ||
| checkv_coords = pd.DataFrame() | ||
| for index, row in checkv_proviruses.iterrows(): | ||
| contig_id = row['contig_id'] | ||
| region_types = row['region_types'].split(',') | ||
| provirus_count = 0 | ||
| # parse though regions for each contig | ||
| for i in range(len(region_types)): | ||
| if region_types[i] == 'viral': | ||
| # if a region is viral, extract contig name, assign a provirus id, and add checkv start/end coords | ||
| provirus_info = pd.DataFrame() | ||
| provirus_count += 1 | ||
| provirus_info['seq_name'] = [contig_id] | ||
| provirus_info['provirus_id'] = [contig_id + '|checkv_provirus_' + str(provirus_count)] | ||
| provirus_info[['provirus_start', 'provirus_stop']] = [row['region_coords_bp'].split(',')[i].split('-')] | ||
| checkv_coords = pd.concat([checkv_coords, provirus_info], axis=0) |
There was a problem hiding this comment.
This is a fairly inefficient way of creating a dataframe. Better to create a list of DataFrames then concat once:
pd.concat([checkProvirusCoords(inputs) for inputs in checkv_proviruses])
Not a blocker because I doubt this is slow - but worth noting.
| else: | ||
| provirus_coords['fragment'] = row['seq_name'] + '|provirus_' + str(provirus_coords['start'][0]) + '_' + str(provirus_coords['stop'][0]) | ||
| # concatenate all provirus coordinates | ||
| provirus_combined_coords = pd.concat([provirus_combined_coords, provirus_coords], axis=0) |
|
|
||
|
|
||
| test("fasta.gz") { | ||
| test("fasta.gz + 95 + 0 + 85") { |
There was a problem hiding this comment.
Could you use a more clear name here, like params?
| test("fasta.gz + 95 + 0 + 85") { | |
| test("fasta.gz, anicluster_min_ani = 95, anicluster_min_qcov = 0, anicluster_min_tcov = 85") { |
| except IndexError: | ||
| pass | ||
|
|
||
| sys.stderr.write("\nError: -v coordinates file is formatted incorrectly. See README for details. Exiting.\n") |
There was a problem hiding this comment.
Using logging would be more appropriate than sys.stderr
There was a problem hiding this comment.
Or if it's an error, raise YourException("message")
| if any(exist): | ||
| exit(1) | ||
|
|
||
| vibe_header = prophages_check(vibe) | ||
| if not vibe_header: | ||
| exit(1) | ||
|
|
||
| # verify inputs | ||
| check = [samfile, bamfile, forward, interleaved, unpaired] | ||
| check = [c for c in check if c != ''] | ||
| if len(check) > 1 or not check: | ||
| sys.stderr.write(f"\nOnly one input file (-s, -b, -r, -i, -u) is allowed. {len(check)} provided. Exiting.\n") | ||
| exit(1) | ||
|
|
||
| if forward and reverse: | ||
| if not forward.endswith('.fastq') and not forward.endswith('.fastq.gz'): | ||
| sys.stderr.write("\nError: Provided paired reads files must both have the extension .fastq or .fastq.gz. Exiting.\n") | ||
| sys.stderr.write(f"{forward}\n") | ||
| exit(1) | ||
| if not reverse.endswith('.fastq') and not reverse.endswith('.fastq.gz'): | ||
| sys.stderr.write("\nError: Provided paired reads files must both have the extension .fastq or .fastq.gz. Exiting.\n") | ||
| sys.stderr.write(f"{reverse}\n\n") | ||
| exit(1) | ||
| if interleaved: | ||
| if not interleaved.endswith('.fastq') and not interleaved.endswith('.fastq.gz'): | ||
| sys.stderr.write("\nError: Provided interleaved reads file must have the extension .fastq or .fastq.gz. Exiting.\n") | ||
| sys.stderr.write(f"{interleaved}\n\n") | ||
| exit(1) | ||
| if unpaired: | ||
| if not unpaired.endswith('.fastq') and not unpaired.endswith('.fastq.gz'): | ||
| sys.stderr.write("\nError: Provided unpaired reads file must have the extension .fastq or .fastq.gz. Exiting.\n") | ||
| sys.stderr.write(f"{unpaired}\n\n") | ||
| exit(1) | ||
|
|
||
| if samfile: | ||
| if not_exist(samfile, 'sam file'): | ||
| exit(1) | ||
| if not bamfile.endswith('.sam'): | ||
| sys.stderr.write("\nError: Provided sam file must have the extension .sam. Exiting.\n") | ||
| exit(1) | ||
| if bamfile: | ||
| if not_exist(bamfile, 'bam file'): | ||
| exit(1) | ||
| if not bamfile.endswith('.bam'): | ||
| sys.stderr.write("\nError: Provided bam file must have the extension .bam. Exiting.\n") | ||
| exit(1) | ||
|
|
||
| if effect < 0.6: | ||
| sys.stderr.write("\nError: Cohen's d effect size (-e) should not be set below 0.6. Exiting.\n") | ||
| exit(1) | ||
| if min_breadth > 1: | ||
| sys.stderr.write("\nError: breadth (--breadth) should be a decimal value <= 1. Exiting.\n") | ||
| exit(1) | ||
| if ratio_cutoff < 1.5: | ||
| sys.stderr.write("\nError: ratio cutoff (-c) should not be set below 1.5. Exiting.\n") | ||
| exit(1) | ||
| if read_id > 1: | ||
| sys.stderr.write("\nError: percent identity (-p) should be a decimal value <= 1. Exiting.\n") | ||
| exit(1) | ||
| read_id = 1.0 - read_id |
There was a problem hiding this comment.
Better to raise an error instead of exit, it's more robust and better to handle. exit can cause the process to exit prematurely.
PR checklist
nf-core lint).nf-test test main.nf.test -profile test,docker).nextflow run . -profile debug,test,docker --outdir <OUTDIR>).docs/usage.mdis updated.docs/output.mdis updated.CHANGELOG.mdis updated.README.mdis updated (including new tool citations and authors/contributors).